In [ ]:
# COVID-19 Global Data Analysis
This project analyzes global COVID-19 case and death data from the WHO. It includes data cleaning, visualizations, country and regional comparisons, and geographic mapping using interactive choropleth charts.
Tools used: Python, Pandas, Seaborn, Matplotlib, Plotly.
Goals:
- Understand global trends and regional differences
- Identify hotspots and infection trajectories
- Normalize data using per capita metrics
In [13]:
import pandas as pd
covid_df= pd.read_csv("C:/Users/ADMIN/Desktop/pipi/covid/WHO-COVID-19-global-daily-data.csv")
#load the dataset
In [15]:
df.head(10).
### Initial Data Preview
Here we preview the first few rows of the dataset to understand its structure.
Out[15]:
| Date_reported | Country_code | Country | WHO_region | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2020-01-04 | VC | Saint Vincent and the Grenadines | AMR | NaN | 0 | NaN | 0 |
| 1 | 2020-01-04 | SN | Senegal | AFR | NaN | 0 | NaN | 0 |
| 2 | 2020-01-04 | SB | Solomon Islands | WPR | 0.0 | 0 | 0.0 | 0 |
| 3 | 2020-01-04 | LK | Sri Lanka | SEAR | 0.0 | 0 | 0.0 | 0 |
| 4 | 2020-01-04 | SY | Syrian Arab Republic | EMR | NaN | 0 | NaN | 0 |
| 5 | 2020-01-04 | TJ | Tajikistan | EUR | NaN | 0 | NaN | 0 |
| 6 | 2020-01-04 | TH | Thailand | SEAR | 0.0 | 0 | 0.0 | 0 |
| 7 | 2020-01-04 | AE | United Arab Emirates | EMR | NaN | 0 | NaN | 0 |
| 8 | 2020-01-04 | TZ | United Republic of Tanzania | AFR | NaN | 0 | NaN | 0 |
| 9 | 2020-01-04 | VE | Venezuela (Bolivarian Republic of) | AMR | NaN | 0 | NaN | 0 |
In [17]:
df.info
Out[17]:
<bound method DataFrame.info of Date_reported Country_code Country \
0 2020-01-04 VC Saint Vincent and the Grenadines
1 2020-01-04 SN Senegal
2 2020-01-04 SB Solomon Islands
3 2020-01-04 LK Sri Lanka
4 2020-01-04 SY Syrian Arab Republic
... ... ... ...
472555 2025-05-25 MX Mexico
472556 2025-05-25 MR Mauritania
472557 2025-05-25 ML Mali
472558 2025-05-25 ME Montenegro
472559 2025-05-25 NC New Caledonia
WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths
0 AMR NaN 0 NaN 0
1 AFR NaN 0 NaN 0
2 WPR 0.0 0 0.0 0
3 SEAR 0.0 0 0.0 0
4 EMR NaN 0 NaN 0
... ... ... ... ... ...
472555 AMR NaN 7622513 NaN 334818
472556 AFR NaN 63889 NaN 997
472557 AFR NaN 33193 NaN 743
472558 EUR NaN 251280 NaN 2654
472559 WPR NaN 80203 NaN 314
[472560 rows x 8 columns]>
In [9]:
df.describe
Out[9]:
<bound method NDFrame.describe of Date_reported Country_code Country \
0 2020-01-04 VC Saint Vincent and the Grenadines
1 2020-01-04 SN Senegal
2 2020-01-04 SB Solomon Islands
3 2020-01-04 LK Sri Lanka
4 2020-01-04 SY Syrian Arab Republic
... ... ... ...
472555 2025-05-25 MX Mexico
472556 2025-05-25 MR Mauritania
472557 2025-05-25 ML Mali
472558 2025-05-25 ME Montenegro
472559 2025-05-25 NC New Caledonia
WHO_region New_cases Cumulative_cases New_deaths Cumulative_deaths
0 AMR NaN 0 NaN 0
1 AFR NaN 0 NaN 0
2 WPR 0.0 0 0.0 0
3 SEAR 0.0 0 0.0 0
4 EMR NaN 0 NaN 0
... ... ... ... ... ...
472555 AMR NaN 7622513 NaN 334818
472556 AFR NaN 63889 NaN 997
472557 AFR NaN 33193 NaN 743
472558 EUR NaN 251280 NaN 2654
472559 WPR NaN 80203 NaN 314
[472560 rows x 8 columns]>
In [11]:
df.isnull().sum()
Out[11]:
Date_reported 0 Country_code 1969 Country 0 WHO_region 0 New_cases 263304 Cumulative_cases 0 New_deaths 319112 Cumulative_deaths 0 dtype: int64
In [21]:
## Data Cleaning
#In this step, we clean and prepare the COVID-19 dataset for analysis. This includes:
#Converting date columns
#Handling missing values
#Replacing negative values in case and death counts
In [23]:
df['Date_reported'] = pd.to_datetime(df['Date_reported']) #converting date columns
In [25]:
df.isnull().sum() #we check for missing data in all columns to determine if any rows need to be filled
Out[25]:
Date_reported 0 Country_code 1969 Country 0 WHO_region 0 New_cases 263304 Cumulative_cases 0 New_deaths 319112 Cumulative_deaths 0 dtype: int64
In [29]:
df[df['Country_code'].isna()].head(10) #checking the country codes to identify the missing values
Out[29]:
| Date_reported | Country_code | Country | WHO_region | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | |
|---|---|---|---|---|---|---|---|---|
| 207 | 2020-01-04 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 279 | 2020-01-05 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 688 | 2020-01-06 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 759 | 2020-01-07 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 1168 | 2020-01-08 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 1239 | 2020-01-09 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 1648 | 2020-01-10 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 1717 | 2020-01-11 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 2128 | 2020-01-12 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
| 2198 | 2020-01-13 | NaN | Namibia | AFR | NaN | 0 | NaN | 0 |
In [37]:
import pycountry
def get_country_code(name):
try:
return pycountry.countries.lookup(name).alpha_2
except LookupError:
return None
In [40]:
mask = df['Country_code'].isna()
df.loc[mask, 'Country_code'] = df.loc[mask,'Country'].apply(get_country_code)
### Automatically fill missing `Country_code` values using `Country` names
In [44]:
df['Country_code'].isna().sum() #confirms they are no missing values.
Out[44]:
0
In [54]:
missing_percentage = df[['New_cases', 'New_deaths']].isna().mean() * 100
print(missing_percentage) #more than half of the new cases and new deaths were missing
#used linear interpolation by country to fill in missing values
New_cases 55.718639 New_deaths 67.528356 dtype: float64
In [58]:
df['New_cases'] = df.groupby('Country')['New_cases'].transform(lambda x: x.interpolate(method='linear'))
df['New_deaths'] = df.groupby('Country')['New_deaths'].transform(lambda x: x.interpolate(method='linear'))
#used linear interpolation by country to fill in missing value
In [60]:
print(df[['New_cases', 'New_deaths']].isna().sum())
New_cases 17188 New_deaths 40146 dtype: int64
In [62]:
df['New_cases'] = df['New_cases'].fillna(0)
df['New_deaths'] = df['New_deaths'].fillna(0) #filled with 0 to make the data complete
In [78]:
df.isnull().sum() #confirmation
Out[78]:
Date_reported 0 Country_code 0 Country 0 WHO_region 0 New_cases 0 Cumulative_cases 0 New_deaths 0 Cumulative_deaths 0 dtype: int64
In [90]:
numeric_df = df.select_dtypes(include='number')
#selecting only numeric columns
In [92]:
negative_counts = (numeric_df < 0).sum()
print(negative_counts)
# checking for negative values
New_cases 743 Cumulative_cases 0 New_deaths 0 Cumulative_deaths 0 dtype: int64
In [94]:
df['New_cases'] = df['New_cases'].apply(lambda x: max(x, 0))
#remove the negative values
In [96]:
print((df['New_cases'] < 0).sum()) # Should return 0
#confirmation
0
In [98]:
df.describe()
Out[98]:
| Date_reported | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | |
|---|---|---|---|---|---|
| count | 472560 | 4.725600e+05 | 4.725600e+05 | 472560.000000 | 4.725600e+05 |
| mean | 2022-09-14 00:00:00.000000256 | 3.742682e+03 | 2.001990e+06 | 29.733821 | 2.139459e+04 |
| min | 2020-01-04 00:00:00 | 0.000000e+00 | 0.000000e+00 | 0.000000 | 0.000000e+00 |
| 25% | 2021-05-10 00:00:00 | 2.000000e+00 | 6.265000e+03 | 0.000000 | 3.800000e+01 |
| 50% | 2022-09-14 00:00:00 | 2.800000e+01 | 5.714900e+04 | 1.000000 | 6.810000e+02 |
| 75% | 2024-01-19 00:00:00 | 3.300000e+02 | 6.432410e+05 | 4.000000 | 7.693000e+03 |
| max | 2025-05-25 00:00:00 | 6.966046e+06 | 1.034368e+08 | 44047.000000 | 1.224213e+06 |
| std | NaN | 4.448369e+04 | 8.405211e+06 | 201.013603 | 8.597986e+04 |
In [109]:
high_case_threshold = 1000000
high_case_df=df[df['New_cases'] > high_case_threshold]
In [111]:
high_case_df.sort_values('New_cases', ascending=False).head(10)
#reviewing days with extremely high new case counts
Out[111]:
| Date_reported | Country_code | Country | WHO_region | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | |
|---|---|---|---|---|---|---|---|---|
| 260351 | 2022-12-23 | CN | China | WPR | 6966046.0 | 50447985 | 894.0 | 36318 |
| 259975 | 2022-12-22 | CN | China | WPR | 6434648.0 | 43481939 | 836.0 | 35424 |
| 260456 | 2022-12-24 | CN | China | WPR | 6327801.0 | 56775786 | 1308.0 | 37626 |
| 259872 | 2022-12-21 | CN | China | WPR | 5905312.0 | 37047291 | 628.0 | 34588 |
| 260831 | 2022-12-25 | CN | China | WPR | 5669864.0 | 62445650 | 1369.0 | 38995 |
| 259495 | 2022-12-20 | CN | China | WPR | 5102957.0 | 31141979 | 454.0 | 33960 |
| 260936 | 2022-12-26 | CN | China | WPR | 4768272.0 | 67213922 | 1394.0 | 40389 |
| 261311 | 2022-12-27 | CN | China | WPR | 4462481.0 | 71676403 | 1416.0 | 41805 |
| 261416 | 2022-12-28 | CN | China | WPR | 4356772.0 | 76033175 | 1845.0 | 43650 |
| 259392 | 2022-12-19 | CN | China | WPR | 4068849.0 | 26039022 | 360.0 | 33506 |
In [123]:
df['high_case_flag'] = df['New_cases'] > 100000 #flagging unusually high case counts
In [125]:
df[df['high_case_flag'] == True]
Out[125]:
| Date_reported | Country_code | Country | WHO_region | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | high_case_flag | high_death_flag | |
|---|---|---|---|---|---|---|---|---|---|---|
| 67033 | 2020-10-09 | FR | France | EUR | 104347.571429 | 463008 | 653.142857 | 22134 | True | True |
| 67366 | 2020-10-10 | FR | France | EUR | 110253.285714 | 463008 | 671.571429 | 22134 | True | True |
| 67512 | 2020-10-11 | FR | France | EUR | 116159.000000 | 579167 | 690.000000 | 22824 | True | True |
| 67847 | 2020-10-12 | FR | France | EUR | 123541.285714 | 579167 | 729.000000 | 22824 | True | True |
| 67992 | 2020-10-13 | FR | France | EUR | 130923.571429 | 579167 | 768.000000 | 22824 | True | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 343252 | 2023-12-04 | RU | Russian Federation | EUR | 107174.000000 | 23511820 | 148.000000 | 400803 | True | False |
| 343621 | 2023-12-05 | RU | Russian Federation | EUR | 105706.714286 | 23511820 | 150.285714 | 400803 | True | False |
| 343732 | 2023-12-06 | RU | Russian Federation | EUR | 104239.428571 | 23511820 | 152.571429 | 400803 | True | False |
| 344101 | 2023-12-07 | RU | Russian Federation | EUR | 102772.142857 | 23511820 | 154.857143 | 400803 | True | False |
| 344214 | 2023-12-08 | RU | Russian Federation | EUR | 101304.857143 | 23511820 | 157.142857 | 400803 | True | False |
3060 rows × 10 columns
In [121]:
df['high_death_flag'] = df['New_deaths'] > 200
# Flagging Unusual Daily Death Counts
#Using a threshold of `New_deaths > 200` we flag rows with unusually high daily deaths. These are likely outliers due to data dumps or reporting issues.
In [119]:
df[df['high_death_flag'] == True]
Out[119]:
| Date_reported | Country_code | Country | WHO_region | New_cases | Cumulative_cases | New_deaths | Cumulative_deaths | high_case_flag | high_death_flag | |
|---|---|---|---|---|---|---|---|---|---|---|
| 17634 | 2020-03-17 | ES | Spain | EUR | 18672.428571 | 13174 | 1021.285714 | 482 | False | True |
| 17915 | 2020-03-18 | ES | Spain | EUR | 22207.142857 | 13174 | 1303.428571 | 482 | False | True |
| 18114 | 2020-03-19 | ES | Spain | EUR | 25741.857143 | 13174 | 1585.571429 | 482 | False | True |
| 18393 | 2020-03-20 | ES | Spain | EUR | 29276.571429 | 13174 | 1867.714286 | 482 | False | True |
| 18595 | 2020-03-21 | ES | Spain | EUR | 32811.285714 | 13174 | 2149.857143 | 482 | False | True |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 439734 | 2025-01-09 | US | United States of America | AMR | 93260.000000 | 103436829 | 1034.000000 | 1213913 | False | True |
| 440123 | 2025-01-10 | US | United States of America | AMR | 93260.000000 | 103436829 | 1027.000000 | 1213913 | False | True |
| 440215 | 2025-01-11 | US | United States of America | AMR | 93260.000000 | 103436829 | 1020.000000 | 1213913 | False | True |
| 440603 | 2025-01-12 | US | United States of America | AMR | 93260.000000 | 103436829 | 1013.000000 | 1213913 | False | True |
| 440694 | 2025-01-13 | US | United States of America | AMR | 93260.000000 | 103436829 | 1006.000000 | 1213913 | False | True |
2883 rows × 10 columns
In [127]:
df.drop_duplicates(inplace=True) #dropping duplicates
In [129]:
df.duplicated().sum()
Out[129]:
0
In [131]:
daily_cases = df.groupby('Date_reported')['New_cases'].sum().reset_index()
plt.figure(figsize=(14,6))
plt.plot(daily_cases['Date_reported'], daily_cases['New_cases'], color='blue', linewidth=1.5)
plt.title('Global Daily New COVID-19 Cases')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.grid(True)
plt.tight_layout()
plt.show()
#Global Daily New Cases
In [139]:
daily_deaths= df.groupby('Date_reported')['New_deaths'].sum().reset_index()
plt.figure(figsize=(14,6))
plt.plot(daily_deaths['Date_reported'], daily_cases['New_cases'], color='red',linewidth=1.5)
plt.title('Global Daily COVID-19 deaths')
plt.xlabel('Date')
plt.ylabel('New Deaths')
plt.grid(True)
plt.tight_layout()
plt.show()
#global daily deaths
In [157]:
fig, ax1 = plt.subplots(figsize=(14, 6))
# Primary y-axis for cases
ax1.plot(daily_summary['Date_reported'], daily_summary['Cumulative_cases'], color='blue', label='Cumulative Cases')
ax1.set_xlabel('Date')
ax1.set_ylabel('Cumulative Cases', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')
# Secondary y-axis for deaths
ax2 = ax1.twinx()
ax2.plot(daily_summary['Date_reported'], daily_summary['Cumulative_deaths'], color='red', label='Cumulative Deaths')
ax2.set_ylabel('Cumulative Deaths', color='red')
ax2.tick_params(axis='y', labelcolor='red')
plt.title('Global Cumulative COVID-19 Cases vs Deaths Over Time')
fig.tight_layout()
plt.grid(True)
plt.show()
# Cumulative Cases vs Deaths
In [167]:
latest_date = df['Date_reported'].max()
top10 = df[df['Date_reported'] == latest_date]\
.groupby('Country')['Cumulative_cases'].sum()\
.sort_values(ascending = False)\
.head(10)
top10.plot(kind='bar', color='orange', figsize=(15,10))
plt.title('Top 10 Countries by Cumulative Cases (Latest Date)')
plt.ylabel('Cumulative Cases')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
#Top 10 Countries by Cumulative Cases
In [181]:
from scipy.signal import find_peaks
In [185]:
daily= df.groupby('Date_reported')[['New_cases', 'New_deaths']].sum().reset_index()
# Detect peaks in new cases and new deaths
peaks_cases, _ = find_peaks(daily['New_cases'], distance=20, prominence=1e4)
peaks_deaths, _ = find_peaks(daily['New_deaths'], distance=20, prominence=500)
In [187]:
plt.figure(figsize=(14, 6))
plt.plot(daily['Date_reported'], daily['New_cases'], label='New Cases', color='blue')
plt.plot(daily['Date_reported'].iloc[peaks_cases], daily['New_cases'].iloc[peaks_cases], "x", label='Peaks', color='black')
plt.title("Global Daily New COVID-19 Cases with Peaks")
plt.xlabel("Date")
plt.ylabel("New Cases")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
In [189]:
plt.figure(figsize=(14, 6))
plt.plot(daily['Date_reported'], daily['New_deaths'], label='New Deaths', color='red')
plt.plot(daily['Date_reported'].iloc[peaks_deaths], daily['New_deaths'].iloc[peaks_deaths], "x", label='Peaks', color='black')
plt.title("Global Daily New COVID-19 Deaths with Peaks")
plt.xlabel("Date")
plt.ylabel("New Deaths")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
In [191]:
# Drop accidental duplicate column if it exists
if 'New_Cases' in df.columns:
df.drop(columns='New_Cases', inplace=True)
In [197]:
# Ensure date is datetime
df['Date_reported'] = pd.to_datetime(df['Date_reported'])
# Get top 10 countries by total cumulative cases
top_10_countries = df.groupby('Country')['Cumulative_cases'].max().sort_values(ascending=False).head(10).index
# Filter data for just those countries
top_10_df = df[df['Country'].isin(top_10_countries)]
# Plot
plt.figure(figsize=(14, 6))
for country in top_10_countries:
country_data = top_10_df[top_10_df['Country'] == country]
plt.plot(country_data['Date_reported'], country_data['Cumulative_cases'], label=country)
plt.title('Cumulative COVID-19 Cases Over Time - Top 10 Countries')
plt.xlabel('Date')
plt.ylabel('Cumulative Cases')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()
In [201]:
region_peaks = df.groupby('WHO_region')['New_cases'].max().sort_values(ascending=False)
plt.figure(figsize=(10,6))
sns.barplot(x=region_peaks.values, y=region_peaks.index, palette='magma')
plt.title('Peak Daily New Cases by WHO Region')
plt.xlabel('Peak New Cases')
plt.ylabel('WHO Region')
plt.tight_layout()
plt.show().
#Group by WHO region and get peak new cases per region
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_2016\310614675.py:3: FutureWarning: Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect. sns.barplot(x=region_peaks.values, y=region_peaks.index, palette='magma')
In [205]:
region_daily = df.groupby(['Date_reported', 'WHO_region'])['New_cases'].sum().reset_index()
plt.figure(figsize=(14,6))
for region in df['WHO_region'].unique():
region_data = region_daily[region_daily['WHO_region'] == region]
plt.plot(region_data['Date_reported'], region_data['New_cases'], label=region)
plt.title('Daily New COVID-19 Cases by WHO Region')
plt.xlabel('Date')
plt.ylabel('New Cases')
plt.legend()
plt.tight_layout()
plt.show()
#compares total dailly new cases across WHO regions which were most affected overtime
In [209]:
import plotly.express as px
# Get total cases per country
total_cases_by_country = df.groupby('Country')['Cumulative_cases'].max().reset_index()
# Rename for clarity
total_cases_by_country.columns = ['Country', 'Total_Cases']
In [211]:
fig = px.choropleth(
total_cases_by_country,
locations='Country',
locationmode='country names',
color='Total_Cases',
hover_name='Country',
color_continuous_scale='Reds',
title='Total COVID-19 Cases by Country'
)
fig.update_layout(geo=dict(showframe=False, showcoastlines=False))
fig.show()
### World Map: Total COVID-19 Cases by Country
#This interactive choropleth map visualizes the total cumulative COVID-19 cases by country. Darker red areas represent countries with higher case counts.
In [213]:
# Get total deaths per country
total_deaths_by_country = df.groupby('Country')['Cumulative_deaths'].max().reset_index()
# Rename for clarity
total_deaths_by_country.columns = ['Country', 'Total_Deaths']
In [215]:
fig = px.choropleth(
total_deaths_by_country,
locations='Country',
locationmode='country names',
color='Total_Deaths',
hover_name='Country',
color_continuous_scale='Blues',
title='Total COVID-19 Deaths by Country'
)
fig.update_layout(geo=dict(showframe=False, showcoastlines=False))
fig.show()
#### World Map: Total COVID-19 Deaths by Country
#This interactive choropleth map shows the total number of COVID-19 deaths reported by each country. Darker blue shades indicate higher death tolls.
In [223]:
import os
print(os.getcwd())
C:\Users\ADMIN
In [ ]: